### ---- SCRIPT 02: The purpose of this script is to analyse and visualise the most common ---- ###
### ---- words, hashtags, @mentions and emojis in the profile descriptions of ordinary Twitter ---- ###
### ---- users split by their ideal point positions. It is important to note that emojis do not ---- ###
### ---- appear correctly on Windows machines. This is especially the case for national flags, ---- ###
### ---- which are not supported on Windows. This script was run on Linux for the figures ---- ###
### ---- presented in the accompanying paper.                                                  ---- ###

# Print numbers in fixed notation instead of scientific notation
options(scipen = 999)

#### ---- LIBRARIES ---- ####

library(dplyr) # data wrangling
library(ggplot2) # data visualisation
library(data.table) # easier to work with large datasets
library(tidytext) # working with textual data
library(stringr) # working with textual data
library(emoji) # working with emojis
library(tidyr) # data reshaping

library(devtools) # needed to install the emo package from github
# devtools::install_github("hadley/emo")
library(emo) # working with emojis

#### ---- DIRECTORIES ---- ####

# Relative project folders; each path ends with "/" so that file names can be
# appended directly with paste0()
raw_dir <- "00-raw_data/"             # raw input data
processed_dir <- "01-processed_data/" # processed data read by this script
figure_dir <- "02-figures/"           # figures written by this script

#### ---- LOAD DATA ---- ####

# Read the Twitter user data from the processed data folder. The columns used
# further down are `description` and `user_ideal_point`.
user_data <- paste0(processed_dir, "user_ideal_point_data_updated.csv") %>%
  fread()

#### ---- TWITTER DESCRIPTION PREPROCESSING ---- ####

# Clean and tokenise the Twitter descriptions before analysis. The result has
# one row per retained token (`word`) and keeps the original `description`,
# the `user_ideal_point` and the emojis extracted from that description.
twitter_bio_processed <- user_data %>%

  # Select the relevant columns
  select(description, user_ideal_point) %>%

  # Extract emojis from the piped `description` column. Using the column
  # instead of `user_data$description` keeps the values row-aligned with the
  # pipeline even if rows are filtered or reordered upstream.
  mutate(emojis = emoji_extract_all(description)) %>%

  # Remove URLs, HTML entities and the literals "RT" / "https".
  # NOTE(review): "RT" and "https" are matched anywhere, including inside
  # longer upper-case words (e.g. "ARTIST" becomes "AIST"). Confirm whether
  # word boundaries were intended; changing the pattern alters the token counts.
  mutate(bios_cleaned = str_replace_all(description, "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https", "")) %>%

  # Remove punctuation (except # and @, which mark hashtags and mentions)
  mutate(bios_cleaned = str_replace_all(bios_cleaned, "[[:punct:]&&[^#@]]", ""))

rm(user_data) # free memory

twitter_bio_processed <- twitter_bio_processed %>%

  # Tokenise the cleaned text (converts to lowercase): split on every character
  # that is not a letter, digit, underscore, #, @ or apostrophe
  unnest_tokens(word, bios_cleaned, token = "regex", pattern = "([^A-Za-z_\\d#@'])") %>%

  # Remove stopwords and tokens that do not contain at least one letter
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))
 
#### ---- SUMMARISE THE BIO TEXT BY IDEOLOGICAL CATEGORY ---- ####

# Return the `top_n` most frequent items of one text type in tokenised Twitter bios.
#
# Arguments:
#   text_data  Data frame with one row per token in a `word` column. For the
#              "emojis" type it must also hold an `emojis` list column; emojis
#              are counted once per distinct combination of the non-`word`
#              columns.
#   text_type  One of "words", "hashtags", "mentions" or "emojis". Defaults to
#              "words"; any other value raises an error.
#   top_n      Maximum number of rows to return.
#
# Returns a data frame sorted by descending frequency with the columns `word`
# (a factor whose levels are ordered by `n`) and `n` (the count).
recurring_text_types <- function(text_data,
                                 text_type = c("words", "hashtags", "mentions", "emojis"),
                                 top_n) {

  # Resolve the enumerated choice to a single string. Without this the default
  # vector would reach `if ()` with length 4, and a misspelt type would be
  # silently treated as "emojis".
  text_type <- match.arg(text_type)

  if (text_type == "emojis") {

    # Emojis are stored per bio, so drop the token column and de-duplicate
    # before unnesting; otherwise every emoji would be counted once per token.
    emoji_df <- text_data %>%
      select(-word) %>%
      distinct() %>%
      unnest(emojis)

    # Every extracted white flag is relabelled as the transgender flag and the
    # stand-alone transgender symbol is discarded.
    # NOTE(review): presumably the extractor splits the flag's ZWJ sequence
    # into these two parts - confirm, since other white-flag sequences would
    # be relabelled too.
    emoji_df$emojis[emoji_df$emojis == emo::ji("white_flag")] <- "🏳️‍⚧️"
    emoji_df$emojis[emoji_df$emojis == "⚧️"] <- NA

    type_counts <- emoji_df %>%
      filter(!is.na(emojis)) %>%
      count(emojis, sort = TRUE) %>%
      rename(word = emojis)

  } else {

    # Only the token-based types need the word counts
    token_counts <- text_data %>%
      count(word, sort = TRUE)

    type_counts <- switch(
      text_type,
      words = filter(token_counts,
                     substr(word, 1, 1) != "#" & substr(word, 1, 1) != "@"),
      hashtags = filter(token_counts, substr(word, 1, 1) == "#"),
      mentions = filter(token_counts, substr(word, 1, 1) == "@")
    )
  }

  # Keep the top rows first and build the factor afterwards, so that its
  # levels contain only the returned items instead of the whole vocabulary.
  type_counts %>%
    head(top_n) %>%
    mutate(word = reorder(word, n))
}

# Split the dataset into ideological categories based on ideal points: the
# "Left" group is the bottom third and the "Right" group the top third of the
# ideal point distribution. Then extract the 20 most frequent items of every
# text type in the Twitter bios of each group.

text_types <- c("words", "hashtags", "mentions", "emojis")

# Lower and upper quantile probabilities that delimit each group
ideological_group <- list(
  "Left" = c(0 / 3, 1 / 3),
  "Right" = c(2 / 3, 3 / 3)
)

# Collect one result per (group, text type) pair and bind them once at the
# end instead of growing a data frame inside the loop.
summary_parts <- vector("list", length(ideological_group) * length(text_types))
part_index <- 0L

for (group in names(ideological_group)) {

  # NOTE(review): `twitter_bio_processed` has one row per token, so these
  # quantiles are taken over token rows and accounts with longer bios carry
  # more weight. Confirm whether thirds of accounts were intended.
  bounds <- quantile(twitter_bio_processed$user_ideal_point,
                     probs = ideological_group[[group]], na.rm = TRUE)

  lower_bound <- bounds[[1]]
  upper_bound <- bounds[[2]]

  # Both bounds are inclusive
  text_data <- twitter_bio_processed %>%
    filter(user_ideal_point >= lower_bound & user_ideal_point <= upper_bound)

  for (text_type in text_types) {

    recurring_text_df <- recurring_text_types(text_data, text_type, 20)

    # Label the rows so the combined table can be split again for plotting
    recurring_text_df$text_type <- text_type
    recurring_text_df$group <- group

    part_index <- part_index + 1L
    summary_parts[[part_index]] <- recurring_text_df
  }
}

recurring_bio_text_summary <- bind_rows(summary_parts)

# Plot the different text types (top 20) for each ideological category as faceted lollipop plots

# Keep the summary rows of one ideological group, give the text types their
# display labels and add a plotting position `rank` that is unique across all
# rows (the first row receives the highest rank).
prepare_group_text_data <- function(summary_df, target_group) {
  group_df <- summary_df %>%
    filter(group == target_group) %>%
    mutate(text_type = recode(text_type,
                              "emojis" = "Emojis",
                              "words" = "Words",
                              "hashtags" = "Hashtags",
                              "mentions" = "Mentions"))

  group_df %>%
    mutate(rank = nrow(group_df) - row_number() + 1)
}

# Draw one lollipop panel per text type for the rows prepared by
# prepare_group_text_data(), using `colour` for both segments and points.
plot_recurring_text <- function(group_df, colour) {
  ggplot(group_df, aes(x = rank, y = n)) +
    # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` for line geoms;
    # `size` is kept here so the script still runs on older versions.
    geom_segment(aes(xend = rank, yend = 0), size = 1, color = colour) +
    geom_point(size = 3, color = colour) +
    coord_flip() +
    facet_wrap(~ text_type, scales = "free", ncol = 4) +
    theme_minimal() +
    labs(y = "") +
    theme(
      legend.position = "none",
      axis.title.y = element_blank(),
      strip.text = element_text(size = 12, face = "bold")
    ) +
    # `rank` is unique per row, so with free scales each facet only shows the
    # breaks (and therefore the labels) of its own items.
    scale_x_continuous(
      breaks = group_df$rank,
      labels = as.character(group_df$word)
    )
}

# Left wing accounts
left_wing_accounts <- prepare_group_text_data(recurring_bio_text_summary, "Left")

left_wing_text_plot <- plot_recurring_text(left_wing_accounts, "red")

ggsave(paste0(figure_dir, "top_20_recurring_text_left_wing_accounts_long.png"),
       left_wing_text_plot,
       units = "in", width = 12, height = 4, dpi = 300,
       bg = "white")

# Right wing accounts
right_wing_accounts <- prepare_group_text_data(recurring_bio_text_summary, "Right")

right_wing_text_plot <- plot_recurring_text(right_wing_accounts, "blue")

ggsave(paste0(figure_dir, "top_20_recurring_text_right_wing_accounts_long.png"),
       right_wing_text_plot,
       units = "in", width = 12, height = 4, dpi = 300,
       bg = "white")

### ---- END ---- ###